/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */
package org.biojava.nbio.structure.test.io;

import org.biojava.nbio.structure.*;
import org.biojava.nbio.structure.align.util.AtomCache;
import org.biojava.nbio.structure.io.FileParsingParameters;
import org.biojava.nbio.structure.io.LocalPDBDirectory.ObsoleteBehavior;
import org.biojava.nbio.structure.quaternary.BioAssemblyInfo;
import org.biojava.nbio.structure.xtal.CrystalCell;
import org.junit.After;
import org.junit.BeforeClass;
import org.junit.ComparisonFailure;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.vecmath.Matrix4d;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

import static org.junit.Assert.*;

/**
 * A test to make sure both PDB and mmCIF parsers can properly parse
 * large samples of the PDB, and that both parsers produce consistent results.
 * <p>
 * These tests take very long to run, thus they are excluded by default in the pom.
 * To run them use, for the 1000 entries one:
 * <pre>
 * mvn -Dtest=TestLongPdbVsMmCifParsing#testLongPdbVsMmCif test
 * </pre>
 * or for the 10000 entries:
 * <pre>
 * mvn -Dtest=TestLongPdbVsMmCifParsing#testVeryLongPdbVsMmCif test
 * </pre>
 *
 * @author duarte_j
 */
public class TestLongPdbVsMmCifParsing {

    private static final Logger logger = LoggerFactory.getLogger(TestLongPdbVsMmCifParsing.class);

    private static final String TEST_LARGE_SET_FILE = "/random_1000_set.list";
    private static final String TEST_VERY_LARGE_SET_FILE = "/random_10000_set.list";

    private static final int DOTS_PER_LINE = 100;

    private static final float DELTA = 0.01f;
    private static final float DELTA_RESOLUTION = 0.01f;
    private static final float DELTA_RFREE = 0.01f;

    /**
     * The maximum number of PDBs for which we allow a mismatch of mol_ids (entity_ids) between PDB and mmCIF files
     * If more mismatches than this, the test will fail.
     * As of 2014.12.04 there are 7 mismatches
     */
    private static final int MAX_ALLOWED_MOL_ID_MISMATCHES = 10;

    private static AtomCache cache;
    private static FileParsingParameters params;

    /** The PDB entry currently being tested, or null once a whole set has passed. */
    private String pdbId;

    /** Number of entries processed so far (including the one currently being tested). */
    private int countTested = 0;

    /** The PDB ids for which at least one chain had different mol_ids in PDB and mmCIF. */
    private Set<String> pdbIdsWithMismatchingMolIds;

    /**
     * Creates the shared {@link AtomCache} and refuses to run if it points to the
     * default temp directory, so that PDB_DIR must be set explicitly.
     */
    @BeforeClass
    public static void setUpBeforeClass() {

        cache = new AtomCache();

        System.out.println("##### Starting long test. THIS CAN TAKE UP TO 1 HOUR TO COMPLETE!");
        System.out.println("##### Using PDB/mmCIF cache dir: "+cache.getPath());
        System.out.println("##### Each dot is a PDB entry being tested. "+DOTS_PER_LINE+" dots per line");

        // disallow the use of the default /tmp dir, to make sure PDB_DIR is set
        String tmpDir = System.getProperty("java.io.tmpdir");
        if (cache.getPath().equals(tmpDir) || cache.getPath().equals(tmpDir+File.separator)) {
            throw new IllegalArgumentException("PDB_DIR has not been set or it is set to the default temp directory. Please set PDB_DIR to run this test");
        }

        params = new FileParsingParameters();
        cache.setFileParsingParams(params);
        cache.setObsoleteBehavior(ObsoleteBehavior.THROW_EXCEPTION);
    }

    @Test
    public void testLongPdbVsMmCif() throws IOException, StructureException {
        List<String> pdbIds = readTestSetFile(TEST_LARGE_SET_FILE);
        testAll(pdbIds);
    }

    @Test
    public void testVeryLongPdbVsMmCif() throws IOException, StructureException {
        List<String> pdbIds = readTestSetFile(TEST_VERY_LARGE_SET_FILE);
        testAll(pdbIds);
    }

    @Test
    public void testSingle() throws IOException, StructureException {
        testAll(Arrays.asList("4kro"));
    }

    /**
     * Prints the entry that was being tested when a test stopped.
     * Nothing is printed when a whole set passed, since {@link #testAll(List)} resets the id to null.
     */
    @After
    public void printInfo() {
        if (pdbId!=null) {
            System.out.println("\n##### ----> Last tested PDB entry was: "+pdbId + " ("+ countTested + " done so far)");
        }
    }

    /**
     * Runs the PDB vs mmCIF comparison on every given entry, printing a progress dot per entry,
     * and finally checks the accumulated mol_id mismatch warnings.
     * @param pdbIds the PDB ids to test
     */
    private void testAll(List<String> pdbIds) throws IOException, StructureException {

        pdbIdsWithMismatchingMolIds = new HashSet<>();

        long start = System.currentTimeMillis();

        System.out.println("##### Total of "+pdbIds.size()+" PDB entries to test");

        for (int i = 0; i<pdbIds.size(); i++) {
            pdbId = pdbIds.get(i);
            countTested = i + 1;

            System.out.print(".");

            testSingleEntry(pdbId);

            if ( ( (i+1)%DOTS_PER_LINE )==0 ) {
                System.out.println();
            }
        }

        pdbId = null; // to avoid printing the message if tests pass for all PDB entries

        long end = System.currentTimeMillis();

        checkWarnings();

        System.out.printf("\nDone in %5.1f minutes\n", (end-start)/60000.0);
    }

    /**
     * Fails if the number of entries with mismatching mol_ids is above
     * {@link #MAX_ALLOWED_MOL_ID_MISMATCHES}; exactly the maximum is still accepted.
     */
    private void checkWarnings() {

        if (pdbIdsWithMismatchingMolIds.size()>0) {
            System.out.println("A total of "+pdbIdsWithMismatchingMolIds.size()+" PDB entries have mismatches in their Compound mol_ids (entity_ids)");
        }

        // "<=": only a count strictly above the maximum is a failure
        assertTrue("Mismatching mol_id (entity_id) between pdb and cif above the maximum allowed ("+MAX_ALLOWED_MOL_ID_MISMATCHES+")",
                pdbIdsWithMismatchingMolIds.size()<=MAX_ALLOWED_MOL_ID_MISMATCHES);
    }

    /**
     * Loads one entry with both parsers and compares structure-level data, header and chains.
     * @param pdbId the PDB id to test
     */
    private void testSingleEntry(String pdbId) throws IOException, StructureException {

        Structure sCif = getCifStructure(pdbId);
        Structure sPdb = getPdbStructure(pdbId);

        assertNotNull(sCif);
        assertNotNull(sPdb);

        try {
            testStructureMethods(sPdb, sCif);
            testHeader(sPdb, sCif);
            testChains(sPdb, sCif);
        } catch (ComparisonFailure e) {
            // print the full values before rethrowing
            System.out.println("\nComparison failure! Values follow:");
            System.out.println("Actual  : "+e.getActual());
            System.out.println("Expected: "+e.getExpected());
            throw e;
        }
    }

    /**
     * Compares the values returned by methods of {@link Structure} itself: experiment type flags,
     * number of models, PDB code, entities and SS bonds.
     */
    private void testStructureMethods(Structure sPdb, Structure sCif) {

        assertEquals("failed isNmr:",sPdb.isNmr(), sCif.isNmr());
        assertEquals("failed isCrystallographic:",sPdb.isCrystallographic(), sCif.isCrystallographic());
        assertEquals("failed nrModels:",sPdb.nrModels(), sCif.nrModels());

        assertEquals("failed for getPdbCode:",sPdb.getPDBCode(),sCif.getPDBCode());

        assertFalse(sPdb.isBiologicalAssembly());
        assertFalse(sCif.isBiologicalAssembly());

        // TODO journal article not parsed in mmCIF parser
        //assertEquals("failed hasJournalArticle",sPdb.hasJournalArticle(),sCif.hasJournalArticle());

        // entity type should always be present
        for (EntityInfo e: sPdb.getEntityInfos()) {
            assertNotNull(e.getType());
        }
        for (EntityInfo e: sCif.getEntityInfos()) {
            assertNotNull(e.getType());
        }

        // entities: there's quite some inconsistencies here between pdb and cif:
        // sugar polymers are not in pdb at all: we avoid them
        if (!containsSugar(sCif)) {
            int entCountCif = 0;
            for (EntityInfo e: sCif.getEntityInfos()) {
                if (e.getType() == EntityType.POLYMER) {
                    entCountCif++;
                }
            }
            int entCountPdb = 0;
            for (EntityInfo e: sPdb.getEntityInfos()) {
                if (e.getType() == EntityType.POLYMER) {
                    entCountPdb++;
                }
            }

            assertEquals("failed number of non-sugar polymeric Entities pdb vs cif", entCountPdb, entCountCif);
        }

        // ss bonds
        // 4ab9 contains an error in ssbond in pdb file (misses 1 ssbond)
        // 2bdi contains also errors, the counts in both differ a lot 80 vs 92
        if (!sPdb.getPDBCode().equalsIgnoreCase("4ab9") && !sPdb.getPDBCode().equalsIgnoreCase("2bdi")) {
            assertEquals("number of ss bonds should coincide pdb vs cif", sPdb.getSSBonds().size(), sCif.getSSBonds().size());
        }
    }

    /**
     * Compares the {@link PDBHeader} of both structures: ids, authors, classification, dates,
     * experimental techniques, resolution, title, and then the crystallographic data and
     * the biological assemblies where they are comparable.
     */
    private void testHeader(Structure sPdb, Structure sCif) {

        PDBHeader hPdb = sPdb.getPDBHeader();
        PDBHeader hCif = sCif.getPDBHeader();

        boolean isNmr = sPdb.isNmr();
        boolean isCrystallographic = sPdb.isCrystallographic();

        assertNotNull(hPdb);
        assertNotNull(hCif);

        assertEquals("failed for PDB id (getIdCode)",hPdb.getIdCode(),hCif.getIdCode());

        assertNotNull("pdb authors null",hPdb.getAuthors());
        assertNotNull("cif authors null",hCif.getAuthors());
        // I suppose 2 is a safe bet for authors length...
        assertTrue("authors length should be at least 2",hPdb.getAuthors().length()>=2);
        // for authors we strip spaces in case of ambiguities with names
        // there's too much variability in authors, commenting out, e.g. for 1zjo they don't coincide
        //assertEquals("failed getAuthors:",
        //		hPdb.getAuthors().toLowerCase().replaceAll(" ", ""),
        //		hCif.getAuthors().toLowerCase().replaceAll(" ", ""));

        assertNotNull("pdb classification null in pdb",hPdb.getClassification());
        assertNotNull("cif classification null in cif",hCif.getClassification());
        // there's too much variability in classification between pdb and mmcif, e.g. in 3ofb they don't coincide
        //assertEquals("failed getClassification:",hPdb.getClassification().toLowerCase(), hCif.getClassification().toLowerCase());

        // description is set in CIF parser to same as classification (_struct_keywords.pdbx_keywords field)
        // while in PDB parser it is simply not set
        //assertNotNull("pdb description null",hPdb.getDescription());
        assertNotNull("cif description null",hCif.getDescription());
        //assertEquals("failed getDescription:",hPdb.getDescription().toLowerCase(), hCif.getDescription().toLowerCase());

        assertEquals("failed getDepDate:",hPdb.getDepDate(), hCif.getDepDate());
        assertEquals("failed getModDate:",hPdb.getModDate(), hCif.getModDate());

        assertNotNull(hPdb.getExperimentalTechniques());
        assertNotNull(hCif.getExperimentalTechniques());
        assertTrue(hPdb.getExperimentalTechniques().size()>0);
        assertEquals("failed for getExperimentalTechniques",hPdb.getExperimentalTechniques(),hCif.getExperimentalTechniques());

        // for some Electron Microscopy/Crystallography entries (e.g. 3iz2) the resolution in mmCIF is not present in the usual place
        if (!hPdb.getExperimentalTechniques().contains(ExperimentalTechnique.ELECTRON_CRYSTALLOGRAPHY)
                && !hPdb.getExperimentalTechniques().contains(ExperimentalTechnique.ELECTRON_MICROSCOPY)) {
            assertEquals("failed getResolution:",hPdb.getResolution(), hCif.getResolution(), DELTA_RESOLUTION);
        }

        // JRNL record is sometimes missing (e.g. 21bi) and thus is null, we can't test for nulls here in the general case
        //assertNotNull("journal article null",hPdb.getJournalArticle());
        // TODO journal article not parsed in mmCIF parser
        // TODO when fixed in mmCIF parser, compare PDB to mmCIF values if not null
        //assertNotNull("journal article null",hCif.getJournalArticle());

        assertNotNull("title null in pdb",hPdb.getTitle());
        assertNotNull("title null in cif",hCif.getTitle());
        // for titles we strip spaces in case of ambiguities with spacing
        assertEquals("failed for getTitle",
                hPdb.getTitle().toLowerCase().replaceAll(" ", ""),
                hCif.getTitle().toLowerCase().replaceAll(" ", ""));

        // tests specific to experimental techniques
        if (isNmr) {
            assertEquals("resolution is not the default value in NMR structure",
                    PDBHeader.DEFAULT_RESOLUTION, hPdb.getResolution(), DELTA_RESOLUTION);
        }

        if (!isCrystallographic) {
            // argument order is (message, expected, actual, delta)
            assertEquals("rfree is not the default value in non-crystallographic structure in pdb",
                    PDBHeader.DEFAULT_RFREE, hPdb.getRfree(), DELTA_RFREE);
            assertEquals("rfree is not the default value in non-crystallographic structure in cif",
                    PDBHeader.DEFAULT_RFREE, hCif.getRfree(), DELTA_RFREE);
        }

        if (isCrystallographic) {
            testCrystallographicInfo(hPdb, hCif);
        }

        // biological assemblies
        // a) we don't test in non-crystallographic case because annotation is inconsistent between PDB and mmCIF,
        //    e.g. 2kli (NMR) has bioassembly annotation in mmCIF but not in PDB
        // b) we don't test virus entries (we check via looking at ncs operators==null):
        //    they are inconsistent PDB vs mmCIF (e.g. 1pgw has no oligomeric size in PDB, and 120 in mmCIF)
        if (isCrystallographic && hPdb.getCrystallographicInfo().getNcsOperators()==null
                // 1ruh, 2ms2, 2r06: virus proteins with data consistency issue: it's missing the MTRXn record (so it appears as ncs operators==null)
                && (!sPdb.getPDBCode().equalsIgnoreCase("1ruh"))
                && (!sPdb.getPDBCode().equalsIgnoreCase("2ms2"))
                && (!sPdb.getPDBCode().equalsIgnoreCase("2r06"))) {
            testBioAssemblies(sPdb, hPdb, hCif);
        }
    }

    /**
     * Compares Rfree, space group, crystal cell and NCS operators of two headers.
     * Called only for crystallographic structures.
     */
    private void testCrystallographicInfo(PDBHeader hPdb, PDBHeader hCif) {

        assertEquals("failed for Rfree:",hPdb.getRfree(), hCif.getRfree(), DELTA_RFREE);

        assertNotNull("getCrystallographicInfo is null in pdb",hPdb.getCrystallographicInfo());
        assertNotNull("getCrystallographicInfo is null in cif",hCif.getCrystallographicInfo());

        PDBCrystallographicInfo ciPdb = hPdb.getCrystallographicInfo();
        PDBCrystallographicInfo ciCif = hCif.getCrystallographicInfo();

        assertNotNull("space group null in pdb", ciPdb.getSpaceGroup());
        assertNotNull("space group null in cif", ciCif.getSpaceGroup());
        assertNotNull("crystal cell null in pdb",ciPdb.getCrystalCell());
        assertNotNull("crystal cell null in cif",ciCif.getCrystalCell());

        assertEquals("failed for space group short symbol pdb vs cif",
                ciPdb.getSpaceGroup().getShortSymbol(),
                ciCif.getSpaceGroup().getShortSymbol());

        CrystalCell ccPdb = ciPdb.getCrystalCell();
        CrystalCell ccCif = ciCif.getCrystalCell();

        assertEquals("failed for cell A:",ccPdb.getA(),ccCif.getA(),DELTA);
        assertEquals("failed for cell B:",ccPdb.getB(),ccCif.getB(),DELTA);
        assertEquals("failed for cell C:",ccPdb.getC(),ccCif.getC(),DELTA);
        assertEquals("failed for cell Alpha:",ccPdb.getAlpha(),ccCif.getAlpha(),DELTA);
        assertEquals("failed for cell Beta:",ccPdb.getBeta(),ccCif.getBeta(),DELTA);
        assertEquals("failed for cell Gamma:",ccPdb.getGamma(),ccCif.getGamma(),DELTA);

        if (ciPdb.getNcsOperators()==null) {
            assertNull("NCS operators are null in pdb but not in cif", ciCif.getNcsOperators());
        } else {
            Matrix4d[] ncsOpersPdb = ciPdb.getNcsOperators();
            Matrix4d[] ncsOpersCif = ciCif.getNcsOperators();

            assertEquals("Number of NCS operators don't coincide", ncsOpersPdb.length, ncsOpersCif.length);

            for (int i=0;i<ncsOpersPdb.length;i++) {
                assertTrue("NCS operators "+i+" don't coincide",ncsOpersPdb[i].epsilonEquals(ncsOpersCif[i], 0.0001));
            }
        }
    }

    /**
     * Compares the number, ids and macromolecular sizes of the biological assemblies of two headers.
     * The structure is needed only to skip the size comparison for a known inconsistent entry.
     */
    private void testBioAssemblies(Structure sPdb, PDBHeader hPdb, PDBHeader hCif) {

        assertEquals("Number of bioassemblies doesn't coincide",
                hPdb.getNrBioAssemblies(), hCif.getNrBioAssemblies());

        Map<Integer,BioAssemblyInfo> batPdb = hPdb.getBioAssemblies();
        Map<Integer,BioAssemblyInfo> batCif = hCif.getBioAssemblies();

        assertEquals("Size of bioassemblies map doesn't coincide with nr of bioassemblies",
                hPdb.getNrBioAssemblies(),batPdb.size());

        assertEquals("Size of bioassemblies maps don't coincide",batPdb.size(), batCif.size());

        for (int id:batPdb.keySet()) {
            assertTrue("Bioassembly id is not contained in mmCIF",batCif.containsKey(id));

            // there's an inconsistency in 4amh pdb vs mmCIF in mmSize
            if (sPdb.getPDBCode().equalsIgnoreCase("4amh")) {
                continue;
            }

            assertEquals("Macromolecular size of assembly "+id+" doesn't coincide",
                    batPdb.get(id).getMacromolecularSize(), batCif.get(id).getMacromolecularSize());
        }
    }

    /**
     * Compares chain counts per kind (polymer, non-polymer, water) where they are comparable,
     * and then every polymer chain of the PDB structure against the mmCIF chain of the same name.
     */
    private void testChains(Structure sPdb, Structure sCif) throws StructureException {
        assertNotNull(sPdb.getChains());
        assertNotNull(sCif.getChains());

        // sugar chains are badly annotated and inconsistent between pdb/mmcif
        // let's skip this test if we have sugar entities
        if (!containsSugar(sCif)) {
            assertEquals(sPdb.getPolyChains().size(), sCif.getPolyChains().size());

            // some entries like 3c5e are inconsistent in residue numbering for UNL (unknown) residues between pdb and mmcif
            // skipping this test for them
            if (!containsUNL(sCif)) {
                assertEquals(sPdb.getNonPolyChains().size(), sCif.getNonPolyChains().size());
            }

            assertEquals(sPdb.getWaterChains().size(), sCif.getWaterChains().size());

            if (!containsUNL(sCif)) {
                assertEquals(sPdb.getChains().size(),sCif.getChains().size());
            }
        }

        // sorted, de-duplicated polymer chain names of the PDB structure
        Set<String> chainIds = new TreeSet<>();
        for (Chain chain:sPdb.getPolyChains()){
            chainIds.add(chain.getName());
        }

        for (String chainId:chainIds) {
            testSingleChain(sPdb.getPolyChainByPDB(chainId), sCif.getPolyChainByPDB(chainId));
        }
    }

    /**
     * Compares one polymer chain as parsed from PDB against the same chain as parsed from mmCIF:
     * names, entity, atom/seqres lengths and group counts per {@link GroupType}.
     * Mismatching mol_ids are only recorded (see {@link #checkWarnings()}), not asserted.
     */
    private void testSingleChain(Chain cPdb, Chain cCif) {
        assertNotNull(cPdb);
        assertNotNull(cCif);

        String chainId = cPdb.getName();

        assertEquals("failed for getName():",cPdb.getName(),cCif.getName());

        // TODO no internalChainID if parsed from PDB, should an ID be assigned following the same rules as in mmCIF?
        //assertEquals("failed for getInternalChainID():",cPdb.getInternalChainID(),cCif.getInternalChainID());
        assertNotNull("getId is null",cCif.getId());
        assertTrue("id used in mmCIF files must be at most 4 characters",cCif.getId().length()<=4);
        assertEquals("chainID must be 1 character only, failed for pdb", 1, cPdb.getName().length());
        assertEquals("chainID must be 1 character only, failed for cif", 1, cCif.getName().length());

        // getCompound() is some times null for badly formatted PDB files (e.g. 4a10, all waters are in a separate chain F)
        if (isPolymer(cPdb)) {
            assertNotNull("getCompound is null in pdb (chain "+chainId+")",cPdb.getEntityInfo());
            assertNotNull("getCompound is null in cif (chain "+chainId+")",cCif.getEntityInfo());

            // for some badly formatted entries there are mismatches of mol_ids on pdb vs mmcif, e.g. 2efw
            // we thus count them and only warn at the end
            int molIdPdb = cPdb.getEntityInfo().getMolId();
            int molIdCif = cCif.getEntityInfo().getMolId();
            if (molIdPdb!=molIdCif) {
                logger.warn("Mismatching mol_id (entity_id) for {}. pdb: {}, mmCIF: {}",pdbId,molIdPdb,molIdCif);
                pdbIdsWithMismatchingMolIds.add(pdbId);
            }
        }

        assertNotNull("getParent is null in pdb (chain "+chainId+")",cPdb.getStructure());
        assertNotNull("getParent is null in cif (chain "+chainId+")",cCif.getStructure());

        assertEquals("failed for getAtomLength (chain "+chainId+"):",cPdb.getAtomLength(),cCif.getAtomLength());

        // entries with polymers composed of all unknowns (giving only-X sequences) can't be aligned seqres-to-atom (for PDB files)
        // we've got to skip them because they won't have seqres groups
        // e.g. is 1jnv chain A
        if (cPdb.getAtomSequence().matches("^X+$")) {
            return;
        }

        // note for getSeqResLength to work one needs the setAlignSeqRes option in the parsers
        assertEquals("failed for getSeqResLength pdb vs cif (chain "+chainId+"):",
                cPdb.getSeqResLength(),cCif.getSeqResLength());
        assertEquals("failed for getSeqResGroups().size pdb vs cif",
                cPdb.getSeqResGroups().size(), cCif.getSeqResGroups().size());
        assertEquals("getSeqResLength and getSeqResGroups.size should coincide in pdb:",
                cPdb.getSeqResLength(),cPdb.getSeqResGroups().size());
        assertEquals("getSeqResLength and getSeqResGroups.size should coincide in cif:",
                cCif.getSeqResLength(),cCif.getSeqResGroups().size());

        assertEquals("failed for getAtomGroups().size pdb vs cif",
                cPdb.getAtomGroups().size(), cCif.getAtomGroups().size());
        assertEquals("getAtomLength and getAtomGroups.size should coincide in pdb:",
                cPdb.getAtomLength(),cPdb.getAtomGroups().size());
        assertEquals("getAtomLength and getAtomGroups.size should coincide in cif:",
                cCif.getAtomLength(),cCif.getAtomGroups().size());

        assertEquals("failed for getAtomGroups(GroupType.AMINOACID) pdb vs cif:",
                cPdb.getAtomGroups(GroupType.AMINOACID).size(),cCif.getAtomGroups(GroupType.AMINOACID).size());
        assertEquals("failed for getAtomGroups(GroupType.HETATM) pdb vs cif:",
                cPdb.getAtomGroups(GroupType.HETATM).size(),cCif.getAtomGroups(GroupType.HETATM).size());
        assertEquals("failed for getAtomGroups(GroupType.NUCLEOTIDE) pdb vs cif:",
                cPdb.getAtomGroups(GroupType.NUCLEOTIDE).size(),cCif.getAtomGroups(GroupType.NUCLEOTIDE).size());

        // In 4imj, chain F there's an alignment ambiguity because of a repeat, so the seqres to atom alignment
        // doesn't work properly for it, we skip the rest of the test for this chain
        if (cPdb.getStructure().getPDBCode().equalsIgnoreCase("4imj") && cPdb.getName().equals("F")) {
            return;
        }

        assertEquals("failed for getSeqResGroups(GroupType.AMINOACID) pdb vs cif:",
                cPdb.getSeqResGroups(GroupType.AMINOACID).size(),cCif.getSeqResGroups(GroupType.AMINOACID).size());
        assertEquals("failed for getSeqResGroups(GroupType.HETATM) pdb vs cif:",
                cPdb.getSeqResGroups(GroupType.HETATM).size(),cCif.getSeqResGroups(GroupType.HETATM).size());
        assertEquals("failed for getSeqResGroups(GroupType.NUCLEOTIDE) pdb vs cif:",
                cPdb.getSeqResGroups(GroupType.NUCLEOTIDE).size(),cCif.getSeqResGroups(GroupType.NUCLEOTIDE).size());

        assertTrue("getAtomLength must be at least 1 in length (chain "+chainId+")",cPdb.getAtomLength()>=1);

        if (isPolymer(cPdb)) {
            // some badly formatted PDB files (e.g. 4a10, all waters are in a separate chain F) have 0 seqres length for some chains
            assertTrue("getSeqResLength must be at least 1 in length (chain "+chainId+")",cPdb.getSeqResLength()>=1);
        }

        // in the current implementation this is not a valid test, entries that have aminoacid residues in
        // ligands, e.g. 3o6g won't pass this test
        //assertTrue("getSeqResLength ("+cPdb.getSeqResLength()+") must be >= than getAtomGroups(GroupType.AMINOACID).size() ("+
        //		cPdb.getAtomGroups(GroupType.AMINOACID).size()+") (chain "+chainName+")",
        //		cPdb.getSeqResLength()>=cPdb.getAtomGroups(GroupType.AMINOACID).size());

        int allAtomGroupsSizePdb = cPdb.getAtomGroups(GroupType.AMINOACID).size()+
                cPdb.getAtomGroups(GroupType.HETATM).size()+
                cPdb.getAtomGroups(GroupType.NUCLEOTIDE).size();
        int allAtomGroupsSizeCif = cCif.getAtomGroups(GroupType.AMINOACID).size()+
                cCif.getAtomGroups(GroupType.HETATM).size()+
                cCif.getAtomGroups(GroupType.NUCLEOTIDE).size();

        assertEquals("failed for sum of all atom group sizes (hetatm+nucleotide+aminoacid) pdb vs mmcif",allAtomGroupsSizePdb,allAtomGroupsSizeCif);

        assertEquals("failed for getAtomLength==hetatm+aminos+nucleotide",cPdb.getAtomLength(), allAtomGroupsSizePdb);

        int allSeqResGroupsSizePdb = cPdb.getSeqResGroups(GroupType.AMINOACID).size()+
                cPdb.getSeqResGroups(GroupType.HETATM).size()+
                cPdb.getSeqResGroups(GroupType.NUCLEOTIDE).size();
        int allSeqResGroupsSizeCif = cCif.getSeqResGroups(GroupType.AMINOACID).size()+
                cCif.getSeqResGroups(GroupType.HETATM).size()+
                cCif.getSeqResGroups(GroupType.NUCLEOTIDE).size();

        assertEquals("failed for sum of all seqres group sizes (hetatm+nucleotide+aminoacid) pdb vs mmcif",allSeqResGroupsSizePdb,allSeqResGroupsSizeCif);

        assertEquals("failed for getSeqResLength==hetatm+aminos+nucleotide",cPdb.getSeqResLength(), allSeqResGroupsSizePdb);
    }

    /**
     * Loads the given entry through the shared cache with mmCIF disabled.
     * Note that this mutates the shared cache and parsing parameters.
     */
    private Structure getPdbStructure(String pdbId) throws IOException, StructureException {
        cache.setUseMmCif(false);

        // set parsing params here:
        params.setAlignSeqRes(true);
        //params.setLoadChemCompInfo(true);
        params.setParseBioAssembly(true);

        return cache.getStructure(pdbId);
    }

    /**
     * Loads the given entry through the shared cache with mmCIF enabled.
     * Note that this mutates the shared cache and parsing parameters.
     */
    private Structure getCifStructure(String pdbId) throws IOException, StructureException {
        cache.setUseMmCif(true);

        // set parsing params here:
        params.setAlignSeqRes(true);
        //params.setLoadChemCompInfo(true);
        params.setParseBioAssembly(true);

        return cache.getStructure(pdbId);
    }

    /**
     * Reads a file containing a list of PDB codes.
     * Lines starting with "#" will be treated as comments
     * Will stop reading after finding an empty line, this is useful to quickly test a modified list.
     * @param testSetFile the classpath resource name of the list file
     * @return the PDB codes in file order
     * @throws FileNotFoundException if the resource is not found on the classpath
     * @throws IllegalArgumentException if a line is not a valid PDB code
     * @throws IOException if reading the resource fails
     */
    private List<String> readTestSetFile(String testSetFile) throws IOException {

        InputStream inStream = this.getClass().getResourceAsStream(testSetFile);
        if (inStream == null) {
            // getResourceAsStream signals a missing resource with null
            throw new FileNotFoundException("Test set resource not found on classpath: "+testSetFile);
        }

        List<String> list = new ArrayList<>();

        // try-with-resources: the stream is closed also when an invalid code aborts the read
        try (BufferedReader br = new BufferedReader(new InputStreamReader(inStream, StandardCharsets.UTF_8))) {
            String line;
            while ((line=br.readLine())!=null) {
                if (line.startsWith("#")) {
                    continue;
                }
                if (line.isEmpty()) {
                    break;
                }

                if (!line.matches("\\d\\w\\w\\w")) {
                    throw new IllegalArgumentException("The input test set "+testSetFile+" contains an invalid PDB code: "+line);
                }

                list.add(line);
            }
        }

        return list;
    }

    /**
     * Returns true if the chain has at least one amino-acid or nucleotide among its seqres groups.
     */
    private boolean isPolymer(Chain chain) {

        for (Group group : chain.getSeqResGroups()) {
            if ((group instanceof AminoAcid) || (group instanceof NucleotideImpl)) {
                return true;
            }
        }

        // not a single amino-acid or nucleotide, must be something not polymeric
        return false;
    }

    /**
     * Returns true if any entity of the structure has "SUGAR" in its description.
     */
    private boolean containsSugar(Structure s) {
        for (EntityInfo e:s.getEntityInfos()) {
            if (e.getDescription().contains("SUGAR")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns true if any non-polymer chain of the structure contains a group named "UNL".
     */
    private boolean containsUNL(Structure s) {
        for (Chain c:s.getNonPolyChains()) {
            for (Group g:c.getAtomGroups()) {
                if (g.getPDBName().equals("UNL")) {
                    return true;
                }
            }
        }
        return false;
    }
}